library(plotly)
library(data.table)
library(tidyr)
library(knitr)
library(heatmaply)
Preprocessing
- Load data file
- Rename genres for better readability:
    - “Religion, Spirituality & New Age” to “Religion”
    - “Science.fiction” to “SciFi”
    - “Action.and.Adventure” to “Action”
All genres:
[1] "Satire" "SciFi" "Drama" "Action"
[5] "Romance" "Mystery" "Horror" "Self.help"
[9] "Health" "Guide" "Travel" "Children.s"
[13] "Religion" "Science" "History" "Math"
[17] "Anthology" "Poetry" "Encyclopedias" "Dictionaries"
[21] "Comics" "Art" "Cookbooks" "Diaries"
[25] "Journals"
- Check if upper and lower triangle identical
[1] TRUE
- Transform to long and tidy `data.table`
head(books_dt)
- Average number of genres per customer
[1] 2.332187
First ideas
Show me everything!
- Romance, SciFi, Action, History are most bought
- Bought-together clusters:
    - Romance, SciFi, Action, History
    - Dictionaries and Comics
    - Math and Poetry
- Mystery is an outlier
Best pairs
- mostly combinations of most bought genres
Special genres
Hypothesis
- If a customer buys more than 2 genres, they are recorded in more than 1 off-diagonal entry → (2*diagonal - colSum) < 0
- If a genre is bought more often alone than in triplets (or higher): (2*diagonal - colSum) > 0
Look for customers that buy only one genre
- Compare `column sum` and `2*diagonal value`
- Generate a table with `{genreA=<genre>, genreB=NA, diag_diff={2*diagonal-colSum}}`
Error in eval(expr, envir, enclos) : object 'diag_diff' not found
LS0tCnRpdGxlOiAiQWxsaWFueiBEYXRhVml6IENoYWxsZW5nZSIKYXV0aG9yOiAiRGFuaWVsIEJhZGVyIgpvdXRwdXQ6CiAgaHRtbF9ub3RlYm9vazoKICAgIHRvYzogeWVzCiAgICB0b2NfZmxvYXQ6IHllcwogIGh0bWxfZG9jdW1lbnQ6CiAgICB0b2M6IHllcwogICAgdG9jX2Zsb2F0OiB5ZXMKLS0tCgpgYGB7ciwgbWVzc2FnZT1GQUxTRSwgZWNobz1UfQpsaWJyYXJ5KHBsb3RseSkKbGlicmFyeShkYXRhLnRhYmxlKQpsaWJyYXJ5KHRpZHlyKQpsaWJyYXJ5KGtuaXRyKQpsaWJyYXJ5KGhlYXRtYXBseSkKYGBgCgoKYGBge3IsIGVjaG89RkFMU0V9Cm9wdHNfY2h1bmskc2V0KGVjaG89RkFMU0UsIGNhY2hlPUYpCnRvdGFsX2N1c3RvbWVycyA8LSAxOTUzODcKZmlsZV9ib29rc3RvcmUgPC0gZmlsZS5wYXRoKCJ+L0Rvd25sb2Fkcy90b3lkYXRhL2Jvb2tfZ2VucmVzX2RhdGEuY3N2IikKc291cmNlKCJidWlsZF9ib29rX3N0b3JlLlIiKQpgYGAKCgojIFByZXByb2Nlc3NpbmcKCiogTG9hZCBkYXRhIGZpbGUKKiByZW5hbWUgZ2VucmVzIGZvciBiZXR0ZXIgcmVhZGFiaWxpdHkKICAgICogIlJlbGlnaW9uLCBTcGlyaXR1YWxpdHkgJiBOZXcgQWdlIiB0byAiUmVsaWdpb24iCiAgICAqICJTY2llbmNlLmZpY3Rpb24iIHRvICJTY2lGaSIKICAgICogIkFjdGlvbi5hbmQuQWR2ZW50dXJlIiB0byAiQWN0aW9uIgogICAgCkFsbCBnZW5yZXM6CmBgYHtyfQpib29rc19tYXQgPC0gcmVhZC5jc3YoZmlsZV9ib29rc3RvcmUsIHJvdy5uYW1lcyA9IDEpCnJvd25hbWVzKGJvb2tzX21hdCkgPC0gbWFrZS5uYW1lcyhyb3duYW1lcyhib29rc19tYXQpKQpyb3duYW1lcyhib29rc19tYXQpIDwtIHN1YigiU2NpZW5jZS5maWN0aW9uIiwgIlNjaUZpIiwgcm93bmFtZXMoYm9va3NfbWF0KSkKcm93bmFtZXMoYm9va3NfbWF0KSA8LSBzdWIoIkFjdGlvbi5hbmQuQWR2ZW50dXJlIiwgIkFjdGlvbiIsIHJvd25hbWVzKGJvb2tzX21hdCkpCnJvd25hbWVzKGJvb2tzX21hdCkgPC0gc3ViKCJSZWxpZ2lvbi4uU3Bpcml0dWFsaXR5Li4uTmV3LkFnZSIsIAogICAgIlJlbGlnaW9uIiwgcm93bmFtZXMoYm9va3NfbWF0KQopCmNvbG5hbWVzKGJvb2tzX21hdCkgPC0gcm93bmFtZXMoYm9va3NfbWF0KQpyb3duYW1lcyhib29rc19tYXQpCmBgYAoKKiBDaGVjayBpZiB1cHBlciBhbmQgbG93ZXIgdHJpYW5nbGUgaWRlbnRpY2FsCgpgYGB7cn0KaXNfdXBwZXJfbG93ZXIgPC0gaWRlbnRpY2FsKAogICAgYm9va3NfbWF0W3VwcGVyLnRyaShib29rc19tYXQpXSwgCiAgICB0KGJvb2tzX21hdClbdXBwZXIudHJpKGJvb2tzX21hdCldCikKaXNfdXBwZXJfbG93ZXIKYGBgCgoqIFRyYW5zZm9ybSB0byBsb25nIGFuZCB0aWR5IGBkYXRhLnRhYmxlYAoKYGBge3J9CmJvb2tzX2R0IDwtIGFzLmRhdGEudGFibGUoYm9va3NfbWF0LCBrZWVwLnJvd25hbWVzID0gVFJVRSkKc2V0bmFtZXMoYm9va3NfZHQsIGMoJ2dlbnJlQScsY29sbmFtZXMoYm9va3NfbWF0KSkpCmJvb2tzX2R0IDwt
IGFzLmRhdGEudGFibGUoZ2F0aGVyKGJvb2tzX2R0LCBnZW5yZUIsIGN1c3RvbWVycywgU2F0aXJlOkpvdXJuYWxzKSkKYGBgCgpgYGB7ciwgZWNobz1UfQpoZWFkKGJvb2tzX2R0KQpgYGAKCgoqIEF2ZXJhZ2UgbnVtYmVyIG9mIGdlbnJlcyBwZXIgY3VzdG9tZXIKCmBgYHtyfQpzdW0oYm9va3NfZHRbZ2VucmVBPT1nZW5yZUIsIGN1c3RvbWVyc10pL3RvdGFsX2N1c3RvbWVycwpgYGAKCgojIEZpcnN0IGlkZWFzCgojIyBTaG93IG1lIGV2ZXJ5dGhpbmchCgpgYGB7ciwgZmlnLndpZHRoPTgsIGZpZy5oZWlnaHQ9OH0KaG0gPC0gaGVhdG1hcHIoYm9va3NfbWF0KQpoZWF0bWFwbHkoaG0sIAogICAgcGxvdF9tZXRob2QgPSAncGxvdGx5JywgCiAgICBjb2xvcnMgPSAgYygnZ3JleTk1JywgJ2RvZGdlcmJsdWUnKQopCmBgYAoKKiBSb21hbmNlLCBTY2lGaSwgQWN0aW9uLCBIaXN0b3J5IGFyZSBtb3N0IGJvdWdodCAKKiBib3VnaHQtdG9nZXRoZXIgY2x1c3RlcnM6CiAgICAqIFJvbWFuY2UsIFNjaUZpLCBBY3Rpb24sIEhpc3RvcnkKICAgICogRGljdGlvbmFyaWVzIGFuZCBDb21pY3MKICAgICogTWF0aCBhbmQgUG9ldHJ5CiogTXlzdGVyeSBpcyBhbiBvdXRsaWVyCgojIyBNb3N0IGJvdWdodCBnZW5yZQoKYGBge3J9CnBsb3RfbHkoZGF0YT1ib29rc19kdFtnZW5yZUE9PWdlbnJlQl1bb3JkZXIoY3VzdG9tZXJzKV0sIAogICAgeD1+Z2VucmVBLCB5PX5jdXN0b21lcnMsIHR5cGU9ImJhciIKKSU+JSBsYXlvdXQoCiAgICBtYXJnaW49bGlzdChiPTEwMCksIAogICAgeGF4aXM9bGlzdChjYXRlZ29yeW9yZGVyPSJ0cmFjZSIpLAogICAgdGl0bGU9Ik1vc3QgYm91Z2h0IGdlbnJlIgopCmBgYAoKIyMgQmVzdCBwYWlycwoKYGBge3J9CmFsbF9nZW5yZXMgPC0gdW5pcXVlKGJvb2tzX2R0JGdlbnJlQSkKYWxsX3BhaXJzIDwtIGNvbWJuKGFsbF9nZW5yZXMsIDIsIHNpbXBsaWZ5ID0gRikKcGFpcl9jdXN0b21lcnMgPC0gCnBhaXJfZHQgPC0gZGF0YS50YWJsZSgKICAgIGdlbnJlX3BhaXJzID0gc2FwcGx5KGFsbF9wYWlycywgZnVuY3Rpb24ocCl7CiAgICAgICAgcGFzdGUoc29ydChwKSwgY29sbGFwc2UgPSAiJiIpfQogICAgKSwKICAgIHBhaXJfY3VzdG9tZXJzID0gc2FwcGx5KGFsbF9wYWlycywgZnVuY3Rpb24ocCl7CiAgICAgICAgYm9va3NfZHRbZ2VucmVBPT1wWzFdICYgZ2VucmVCPT1wWzJdLCBjdXN0b21lcnNdCiAgICB9KQopCnBsb3RfbHkoZGF0YT1wYWlyX2R0W29yZGVyKHBhaXJfY3VzdG9tZXJzLCBkZWNyZWFzaW5nPVQpXVsxOjEwXSwgdHlwZT0nYmFyJywgCiAgICB4PX5nZW5yZV9wYWlycywgeT1+cGFpcl9jdXN0b21lcnMKKSU+JSBsYXlvdXQoCiAgICBtYXJnaW49bGlzdChiPTEwMCksIAogICAgeGF4aXM9bGlzdChjYXRlZ29yeW9yZGVyPSJ0cmFjZSIpLAogICAgdGl0bGU9IlRvcCAxMCBnZW5yZSBwYWlycyIKKQpgYGAKCiogbW9zdGx5IGNvbWJpbmF0aW9ucyBvZiBtb3N0IGJvdWdodCBnZW5yZXMKCgojIFNwZWNpYWwgZ2Vu
cmVzCgpIeXBvdGhlc2lzCgoqIElmIGEgY3VzdG9tZXIgYnV5cyBtb3JlIHRoYW4gMiBnZW5yZXMsIApoZSBpcyByZWNvcmRlZCBpbiBtb3JlIHRoYW4gMSBvZmYtZGlhZ29uYWwgZW50cnkKLS0+ICgyKmRpYWdvbmFsIC0gY29sU3VtKSA8IDAKKiBJZiBhIGdlbnJlIGlzIGJvdWdodCBtb3JlIG9mdGVuIGFsb25lIHRoYW4gaW4gdHJpcGxldHMgKG9yIGhpZ2hlcik6ICgyKmRpYWdvbmFsIC0gY29sU3VtKSA+IDAKCgpMb29rIGZvciBjdXN0b21lcnMgdGhhdCBidXkgb25seSBvbmUgZ2VucmUKCiogQ29tcGFyZSBgY29sdW1uIHN1bWAgYW5kICBgMipkaWFnb25hbCB2YWx1ZWAKKiBnZW5lcmF0ZSB0YWJsZSB3aXRoIApge2dlbnJlQT08Z2VucmU+LCBnZW5yZUI9TkEsICBjdXN0b21lcnM9ezIqZGlhZ29uYWwtY29sU3VtfX1gCgoKYGBge3J9CmFsbF9nZW5yZXMgPC0gdW5pcXVlKGJvb2tzX2R0JGdlbnJlQSkKc2VsZWN0aXZlX2R0IDwtIGRhdGEudGFibGUoKQp0bXAgPC0gc2FwcGx5KGFsbF9nZW5yZXMsIGZ1bmN0aW9uKGcpewogICAgZCA8LSBib29rc19kdFtnZW5yZUE9PWcgJiBnZW5yZUI9PWcsIGN1c3RvbWVyc10KICAgIGNzIDwtIHN1bShib29rc19kdFtnZW5yZUE9PWcsIGN1c3RvbWVyc10pCiAgICBzZWxlY3RpdmVfY3VzdG9tZXJzIDwtIEkoMipkIC0gY3MpIztzZWxlY3RpdmVfY3VzdG9tZXJzCiAgICBzZWxlY3RpdmVfZHQgPDwtIHJiaW5kKHNlbGVjdGl2ZV9kdCwgCiAgICAgICAgZGF0YS50YWJsZShnZW5yZUE9ZywgZ2VucmVCPU5BLCBkaWFnX2RpZmY9c2VsZWN0aXZlX2N1c3RvbWVycykKICAgICkKfSkKCnBfc2VsIDwtIHBsb3RfbHkoZGF0YT1zZWxlY3RpdmVfZHRbb3JkZXIoZGlhZ19kaWZmKV0sIAogICAgeT1+Z2VucmVBLCB4PX5kaWFnX2RpZmYsIHR5cGU9ImJhciIsIGNvbG9yID0gfmRpYWdfZGlmZj4wCiklPiUgbGF5b3V0KAogICAgbWFyZ2luPWxpc3QobD0xMDApLCAKICAgIHlheGlzPWxpc3QoY2F0ZWdvcnlvcmRlcj0idHJhY2UiLCB0aXRsZT0nJyksCiAgICB4YXhpcz1saXN0KHRpdGxlPScyKmRpYWdvbmFsIC0gY29sdW1uU3VtJyksCiAgICB0aXRsZT0iV2hpY2ggZ2VucmVzIGFyZSBib3VnaHQgYWxvbmU/IgopCmBgYApgYGB7cix3YXJuaW5nPUZBTFNFLCBmaWcud2lkdGg9OH0Kc2hvdyhwX3NlbCkKYGBgCgo=